View Javadoc

1   /*
2   jMimeMagic(TM) is a Java library for determining the content type of files or
3   streams.
4   
5   Copyright (C) 2004 David Castro
6   
7   This library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10  version 2.1 of the License, or (at your option) any later version.
11  
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  Lesser General Public License for more details.
16  
17  You should have received a copy of the GNU Lesser General Public
18  License along with this library; if not, write to the Free Software
19  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  
21  For more information, please email arimus@users.sourceforge.net
22  */
23  package net.sf.jmimemagic;
24  
25  import org.apache.commons.logging.Log;
26  import org.apache.commons.logging.LogFactory;
27  
28  import org.xml.sax.Attributes;
29  import org.xml.sax.ContentHandler;
30  import org.xml.sax.ErrorHandler;
31  import org.xml.sax.SAXException;
32  import org.xml.sax.SAXNotRecognizedException;
33  import org.xml.sax.SAXNotSupportedException;
34  import org.xml.sax.SAXParseException;
35  import org.xml.sax.XMLReader;
36  import org.xml.sax.helpers.DefaultHandler;
37  import org.xml.sax.helpers.XMLReaderFactory;
38  
39  import java.io.ByteArrayOutputStream;
40  
41  import java.nio.ByteBuffer;
42  
43  import java.util.ArrayList;
44  import java.util.Collection;
45  import java.util.HashMap;
46  
47  
48  /***
49   * DOCUMENT ME!
50   *
51   * @author $Author$
52   * @version $Revision$
53    */
54  public class MagicParser extends DefaultHandler implements ContentHandler, ErrorHandler
55  {
56      private static String magicFile = "/magic.xml";
57      private static Log log = LogFactory.getLog(MagicParser.class);
58  
59      // Namespaces feature id (http://xml.org/sax/features/namespaces).
60      protected static final String NAMESPACES_FEATURE_ID = "http://xml.org/sax/features/namespaces";
61  
62      // Validation feature id (http://xml.org/sax/features/validation). 
63      protected static final String VALIDATION_FEATURE_ID = "http://xml.org/sax/features/validation";
64  
65      // Schema validation feature id (http://apache.org/xml/features/validation/schema). 
66      protected static final String SCHEMA_VALIDATION_FEATURE_ID = "http://apache.org/xml/features/validation/schema";
67  
68      // Schema full checking feature id (http://apache.org/xml/features/validation/schema-full-checking). 
69      protected static final String SCHEMA_FULL_CHECKING_FEATURE_ID = "http://apache.org/xml/features/validation/schema-full-checking";
70  
71      // Default parser name. 
72      protected static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";
73  
74      // Default namespaces support (true). 
75      protected static final boolean DEFAULT_NAMESPACES = true;
76  
77      // Default validation support (false). 
78      protected static final boolean DEFAULT_VALIDATION = false;
79  
80      // Default Schema validation support (false). 
81      protected static final boolean DEFAULT_SCHEMA_VALIDATION = false;
82  
83      // Default Schema full checking support (false). 
84      protected static final boolean DEFAULT_SCHEMA_FULL_CHECKING = false;
85      private boolean initialized = false;
86      private XMLReader parser = null;
87      private ArrayList stack = new ArrayList();
88      private Collection matchers = new ArrayList();
89      private MagicMatcher matcher = null;
90      private MagicMatch match = null;
91      private HashMap properties = null;
92      private String finalValue = "";
93      private boolean isMimeType = false;
94      private boolean isExtension = false;
95      private boolean isDescription = false;
96      private boolean isTest = false;
97  
98      /*** 
99       * constructor 
100      */
101     public MagicParser()
102     {
103         log.debug("instantiated");
104     }
105 
106     /***
107      * parse the xml file and create our MagicMatcher object list
108      *
109      * @throws MagicParseException DOCUMENT ME!
110      */
111     public synchronized void initialize()
112         throws MagicParseException
113     {
114         boolean namespaces = DEFAULT_NAMESPACES;
115         boolean validation = DEFAULT_VALIDATION;
116         boolean schemaValidation = DEFAULT_SCHEMA_VALIDATION;
117         boolean schemaFullChecking = DEFAULT_SCHEMA_FULL_CHECKING;
118 
119         if (!initialized) {
120             // use default parser
121             try {
122                 parser = XMLReaderFactory.createXMLReader();
123             } catch (Exception e) {
124                 try {
125                     log.debug("falling back to default parser: " + DEFAULT_PARSER_NAME);
126                     parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER_NAME);
127                 } catch (Exception ee) {
128                     throw new MagicParseException("unable to instantiate parser");
129                 }
130             }
131 
132             // set parser features
133             try {
134                 parser.setFeature(NAMESPACES_FEATURE_ID, namespaces);
135             } catch (SAXException e) {
136                 log.debug("initialize(): warning: Parser does not support feature (" +
137                     NAMESPACES_FEATURE_ID + ")");
138             }
139 
140             try {
141                 parser.setFeature(VALIDATION_FEATURE_ID, validation);
142             } catch (SAXException e) {
143                 log.debug("initialize(): warning: Parser does not support feature (" +
144                     VALIDATION_FEATURE_ID + ")");
145             }
146 
147             try {
148                 parser.setFeature(SCHEMA_VALIDATION_FEATURE_ID, schemaValidation);
149             } catch (SAXNotRecognizedException e) {
150                 // ignore
151             } catch (SAXNotSupportedException e) {
152                 log.debug("initialize(): warning: Parser does not support feature (" +
153                     SCHEMA_VALIDATION_FEATURE_ID + ")");
154             }
155 
156             try {
157                 parser.setFeature(SCHEMA_FULL_CHECKING_FEATURE_ID, schemaFullChecking);
158             } catch (SAXNotRecognizedException e) {
159                 // ignore
160             } catch (SAXNotSupportedException e) {
161                 log.debug("initialize(): warning: Parser does not support feature (" +
162                     SCHEMA_FULL_CHECKING_FEATURE_ID + ")");
163             }
164 
165             // set handlers
166             parser.setErrorHandler(this);
167             parser.setContentHandler(this);
168 
169             // parse file
170             try {
171                 // get the magic file URL
172                 String magicURL = MagicParser.class.getResource(magicFile).toString();
173 
174                 if (magicURL == null) {
175                     log.error("initialize(): couldn't load '" + magicURL + "'");
176                     throw new MagicParseException("couldn't load '" + magicURL + "'");
177                 }
178 
179                 parser.parse(magicURL);
180             } catch (SAXParseException e) {
181                 // ignore
182             } catch (Exception e) {
183                 e.printStackTrace();
184                 throw new MagicParseException("parse error occurred - " + e.getMessage());
185             }
186 
187             initialized = true;
188         }
189     }
190 
191     /***
192      * DOCUMENT ME!
193      *
194      * @return DOCUMENT ME!
195      */
196     public Collection getMatchers()
197     {
198         return matchers;
199     }
200 
201     /***
202      * DOCUMENT ME!
203      *
204      * @throws SAXException DOCUMENT ME!
205      */
206     public void startDocument()
207         throws SAXException
208     {
209         log.debug("startDocument()");
210     }
211 
212     /***
213      * DOCUMENT ME!
214      *
215      * @throws SAXException DOCUMENT ME!
216      */
217     public void endDocument()
218         throws SAXException
219     {
220         log.debug("endDocument()");
221     }
222 
223     /***
224      * DOCUMENT ME!
225      *
226      * @param target DOCUMENT ME!
227      * @param data DOCUMENT ME!
228      *
229      * @throws SAXException DOCUMENT ME!
230      */
231     public void processingInstruction(String target, String data)
232         throws SAXException
233     {
234         // do nothing
235     }
236 
237     /***
238      * DOCUMENT ME!
239      *
240      * @param ch DOCUMENT ME!
241      * @param offset DOCUMENT ME!
242      * @param length DOCUMENT ME!
243      *
244      * @throws SAXException DOCUMENT ME!
245      */
246     public void characters(char[] ch, int offset, int length)
247         throws SAXException
248     {
249         String value = new String(ch, offset, length);
250         log.debug("characters(): value is '" + value + "'");
251 
252         finalValue += value;
253     }
254 
255     /***
256      * DOCUMENT ME!
257      *
258      * @param ch DOCUMENT ME!
259      * @param offset DOCUMENT ME!
260      * @param length DOCUMENT ME!
261      *
262      * @throws SAXException DOCUMENT ME!
263      */
264     public void ignorableWhitespace(char[] ch, int offset, int length)
265         throws SAXException
266     {
267         // do nothing
268     }
269 
270     /***
271      * DOCUMENT ME!
272      *
273      * @param uri DOCUMENT ME!
274      * @param localName DOCUMENT ME!
275      * @param qname DOCUMENT ME!
276      * @param attributes DOCUMENT ME!
277      *
278      * @throws SAXException DOCUMENT ME!
279      */
280     public void startElement(String uri, String localName, String qname, Attributes attributes)
281         throws SAXException
282     {
283         log.debug("startElement()");
284         log.debug("startElement(): localName is '" + localName + "'");
285 
286         // create a new matcher
287         if (localName.equals("match")) {
288             log.debug("startElement(): creating new matcher");
289             // match to hold data
290             match = new MagicMatch();
291             // our matcher
292             matcher = new MagicMatcher();
293             matcher.setMatch(match);
294         }
295 
296         // these are subelements of matcher, but also occur elsewhere
297         if (matcher != null) {
298             if (localName.equals("mimetype")) {
299                 isMimeType = true;
300             } else if (localName.equals("extension")) {
301                 isExtension = true;
302             } else if (localName.equals("description")) {
303                 isDescription = true;
304             } else if (localName.equals("test")) {
305                 isTest = true;
306 
307                 int length = attributes.getLength();
308 
309                 for (int i = 0; i < length; i++) {
310                     String attrLocalName = attributes.getLocalName(i);
311                     String attrValue = attributes.getValue(i);
312 
313                     if (attrLocalName.equals("offset")) {
314                         if (!attrValue.equals("")) {
315                             match.setOffset(new Integer(attrValue).intValue());
316                             log.debug("startElement():   setting offset to '" + attrValue + "'");
317                         }
318                     } else if (attrLocalName.equals("length")) {
319                         if (!attrValue.equals("")) {
320                             match.setLength(new Integer(attrValue).intValue());
321                             log.debug("startElement():   setting length to '" + attrValue + "'");
322                         }
323                     } else if (attrLocalName.equals("type")) {
324                         match.setType(attrValue);
325                         log.debug("startElement():   setting type to '" + attrValue + "'");
326                     } else if (attrLocalName.equals("bitmask")) {
327                         if (!attrValue.equals("")) {
328                             match.setBitmask(attrValue);
329                             log.debug("startElement():   setting bitmask to '" + attrValue + "'");
330                         }
331                     } else if (attrLocalName.equals("comparator")) {
332                         match.setComparator(attrValue);
333                         log.debug("startElement():   setting comparator to '" + attrValue + "'");
334                     }
335                 }
336             } else if (localName.equals("property")) {
337                 int length = attributes.getLength();
338                 String name = null;
339                 String value = null;
340 
341                 for (int i = 0; i < length; i++) {
342                     String attrLocalName = attributes.getLocalName(i);
343                     String attrValue = attributes.getValue(i);
344 
345                     if (attrLocalName.equals("name")) {
346                         if (!attrValue.equals("")) {
347                             name = attrValue;
348                         }
349                     } else if (attrLocalName.equals("value")) {
350                         if (!attrValue.equals("")) {
351                             value = attrValue;
352                         }
353                     }
354                 }
355 
356                 // save the property to our map
357                 if ((name != null) && (value != null)) {
358                     if (properties == null) {
359                         properties = new HashMap();
360                     }
361 
362                     if (!properties.containsKey(name)) {
363                         properties.put(name, value);
364                         log.debug("startElement():   setting property '" + name + "'='" + value +
365                             "'");
366                     } else {
367                         log.debug("startElement():   not setting property '" + name +
368                             "', duplicate key");
369                     }
370                 }
371             } else if (localName.equals("match-list")) {
372                 log.debug("startElement(): found submatcher list");
373 
374                 // this means we are processing a child match, so we need to push
375                 // the existing match on the stack
376                 log.debug("startElement(): pushing current matcher to stack");
377                 stack.add(matcher);
378             } else {
379                 // we don't care about this type
380             }
381         }
382     }
383 
384     /***
385      * DOCUMENT ME!
386      *
387      * @param uri DOCUMENT ME!
388      * @param localName DOCUMENT ME!
389      * @param qname DOCUMENT ME!
390      *
391      * @throws SAXException DOCUMENT ME!
392      */
393     public void endElement(String uri, String localName, String qname)
394         throws SAXException
395     {
396         log.debug("endElement()");
397         log.debug("endElement(): localName is '" + localName + "'");
398 
399         // determine which tag these chars are for and save them
400         if (isMimeType) {
401             isMimeType = false;
402             match.setMimeType(finalValue);
403             log.debug("characters(): setting mimetype to '" + finalValue + "'");
404         } else if (isExtension) {
405             isExtension = false;
406             match.setExtension(finalValue);
407             log.debug("characters(): setting extension to '" + finalValue + "'");
408         } else if (isDescription) {
409             isDescription = false;
410             match.setDescription(finalValue);
411             log.debug("characters(): setting description to '" + finalValue + "'");
412         } else if (isTest) {
413             isTest = false;
414             match.setTest(convertOctals(finalValue));
415             log.debug("characters(): setting test to '" + convertOctals(finalValue) + "'");
416         } else {
417             // do nothing
418         }
419 
420         finalValue = "";
421 
422         // need to save the current matcher here if it is filled out enough and
423         // we have an /matcher
424         if (localName.equals("match")) {
425             // FIXME - make sure the MagicMatcher isValid() test works
426             if (matcher.isValid()) {
427                 // set the collected properties on this matcher
428                 match.setProperties(properties);
429 
430                 // add root match
431                 if (stack.size() == 0) {
432                     log.debug("endElement(): adding root matcher");
433                     matchers.add(matcher);
434                 } else {
435                     // we need to add the match to it's parent which is on the
436                     // stack
437                     log.debug("endElement(): adding sub matcher");
438 
439                     MagicMatcher m = (MagicMatcher) stack.get(stack.size() - 1);
440                     m.addSubMatcher(matcher);
441                 }
442             } else {
443                 // don't add invalid matchers
444                 log.info("endElement(): not adding invalid matcher '" + match.getDescription() +
445                     "'");
446             }
447 
448             matcher = null;
449             properties = null;
450 
451             // restore matcher from the stack if we have an /matcher-list
452         } else if (localName.equals("match-list")) {
453             if (stack.size() > 0) {
454                 log.debug("endElement(): popping from the stack");
455                 matcher = (MagicMatcher) stack.get(stack.size() - 1);
456                 // pop from the stack
457                 stack.remove(matcher);
458             }
459         } else if (localName.equals("mimetype")) {
460             isMimeType = false;
461         } else if (localName.equals("extension")) {
462             isExtension = false;
463         } else if (localName.equals("description")) {
464             isDescription = false;
465         } else if (localName.equals("test")) {
466             isTest = false;
467         }
468     }
469 
470     /***
471      * DOCUMENT ME!
472      *
473      * @param ex DOCUMENT ME!
474      *
475      * @throws SAXException DOCUMENT ME!
476      */
477     public void warning(SAXParseException ex)
478         throws SAXException
479     {
480         // FIXME
481     }
482 
483     /***
484      * DOCUMENT ME!
485      *
486      * @param ex DOCUMENT ME!
487      *
488      * @throws SAXException DOCUMENT ME!
489      */
490     public void error(SAXParseException ex)
491         throws SAXException
492     {
493         // FIXME
494         throw ex;
495     }
496 
497     /***
498      * DOCUMENT ME!
499      *
500      * @param ex DOCUMENT ME!
501      *
502      * @throws SAXException DOCUMENT ME!
503      */
504     public void fatalError(SAXParseException ex)
505         throws SAXException
506     {
507         // FIXME
508         throw ex;
509     }
510 
511     /***
512      * replaces octal representations of bytes, written as \ddd to actual byte values.
513      *
514      * @param s a string with encoded octals
515      *
516      * @return string with all octals decoded
517      */
518     private ByteBuffer convertOctals(String s)
519     {
520         int beg = 0;
521         int end = 0;
522         int c1;
523         int c2;
524         int c3;
525         int chr;
526         ByteArrayOutputStream buf = new ByteArrayOutputStream();
527 
528         while ((end = s.indexOf('//', beg)) != -1) {
529             if (s.charAt(end + 1) != '//') {
530                 //log.debug("appending chunk '"+s.substring(beg, end)+"'");
531                 for (int z = beg; z < end; z++) {
532                     buf.write((int) s.charAt(z));
533                 }
534 
535                 //log.debug("found // at position "+end);
536                 //log.debug("converting octal '"+s.substring(end, end+4)+"'");
537                 if ((end + 4) <= s.length()) {
538                     try {
539                         chr = Integer.parseInt(s.substring(end + 1, end + 4), 8);
540 
541                         //log.debug("converted octal '"+s.substring(end+1,end+4)+"' to '"+chr);
542                         //log.debug("converted octal back to '"+Integer.toOctalString(chr));
543 
544                         //log.debug("converted '"+s.substring(end+1,end+4)+"' to "+chr+"/"+((char)chr));
545                         buf.write(chr);
546                         beg = end + 4;
547                         end = beg;
548                     } catch (NumberFormatException nfe) {
549                         //log.debug("not an octal");
550                         buf.write((int) '//');
551                         beg = end + 1;
552                         end = beg;
553                     }
554                 } else {
555                     //log.debug("not an octal, not enough chars left in string");
556                     buf.write((int) '//');
557                     beg = end + 1;
558                     end = beg;
559                 }
560             } else {
561                 //log.debug("appending //");
562                 buf.write((int) '//');
563                 beg = end + 1;
564                 end = beg;
565             }
566         }
567 
568         if (end < s.length()) {
569             for (int z = beg; z < s.length(); z++) {
570                 buf.write((int) s.charAt(z));
571             }
572         }
573 
574         try {
575             log.debug("convertOctals(): returning buffer size '" + buf.size() + "'");
576 
577             ByteBuffer b = ByteBuffer.allocate(buf.size());
578 
579             return b.put(buf.toByteArray());
580         } catch (Exception e) {
581             log.error("convertOctals(): error parsing string: " + e);
582 
583             return ByteBuffer.allocate(0);
584         }
585     }
586 }